import numpy as np
import torch
import matplotlib.pyplot as plt
import statistics
import time
from torch.utils.data import DataLoader, TensorDataset, random_split
from scipy.stats import pearsonr
import copy
from GenerateBeta import GenerateBeta
from EmbDataSet import EmbDataSet
from GD import GD
from SGD import SGD
from CVSGD import CVSGD
from ANTICVSGD import ANTICVSGD

# ---- Experiment configuration -------------------------------------------
# The comments below describe how each constant is used in THIS file; the
# semantics inside GD / SGD / EmbDataSet / GenerateBeta are not visible here.
NumExp = 5  # number of model initialisations; one model and one outer-loop pass each
NumS = 10  # number of (beta, dataset) draws per experiment
NumSet = 2  # number of training indices i replaced to build the perturbed set S(i)
NumRun = 2  # number of SGD repetitions (one entry of ManualSeeds each)
eps = 2500  # forwarded as `eps` to GD/SGD; scales both iteration counts (presumably epochs)
bs = 5  # forwarded as `bs` (batch size); divides the SGD iteration count
eva_bs = 5  # only referenced by the disabled CVSGD/ANTICVSGD code
lr = 0.02  # forwarded as `learningrate`
K = 4  # forwarded to GD/SGD as `K` -- meaning not visible in this file
KBS = 40  # forwarded to GD/SGD as `KBS` -- meaning not visible in this file
d = 100  # model input dimension; NOTE: rebound from EmbDataSet's return value in the main loop
emb = 100  # forwarded to EmbDataSet as `emb`
nz = 5  # forwarded to GenerateBeta as `nz` (presumably the number of non-zeros in beta)
N = 80  # forwarded to EmbDataSet as `N`; N * TrainRatio is used as the training-set size
EpTimes = 8  # GD records eps * EpTimes iterations
NumCan = 4  # only referenced by the disabled CVSGD/ANTICVSGD code
TrainRatio = 0.5  # fraction of N used for training
Replacement = False  # forwarded to GD/SGD as `Replacement`
RandomBeta = True  # forwarded to GenerateBeta as `Random`
#ManualSeeds = np.random.randint(100, size=NumRun)
Colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']  # one matplotlib colour per experiment

# Wall-clock start time; not read again in the visible part of the file.
starting_time = time.time()
# Seed both RNGs so model initialisation and the seed tables below are reproducible.
torch.manual_seed(123)
np.random.seed(123)
# randint(100, ...) draws from [0, 100) with replacement, so seeds may repeat.
ManualSeeds = np.random.randint(100, size=NumRun)  # per-run optimiser seeds
BetaSeeds = np.random.randint(100, size=NumS)  # one GenerateBeta seed per draw s
DataSetSeeds = np.random.randint(100, size=NumS * 2)  # two per draw: main set and alternative set

#x_train, y_train, x_test, y_test, d, N_train, N_test = EmbDataSet(RandomSeed=DataSetSeeds[0], d=100, emb=50, N=80)
#x_train_alt, y_train_alt, _, _, _, _, _ = EmbDataSet(RandomSeed=DataSetSeeds[1], d=100, emb=50, N=80)
#print(y_train)
#print(y_train_alt)
#dataset = TensorDataset(torch.arange(40).view(-1, 1))
#loader = DataLoader(dataset, shuffle=True, batch_size=10)
N_train = int(N * TrainRatio)


def _zero_traces(length):
    """Return one zero-filled array of ``length`` entries per experiment."""
    return [np.zeros(length) for _ in range(NumExp)]


# ---- GD accumulators: one array of GDites entries per experiment ---------
# Every list below is indexed as [exp][iteration]; values are summed in the
# main loop and divided by the number of samples afterwards.
GDites = int(eps * EpTimes)
GDSecondMoment = _zero_traces(GDites)
GDFirstMoment = _zero_traces(GDites)
GDTrainLossAve = _zero_traces(GDites)
GDTestLossAve = _zero_traces(GDites)
GDGradientVarianceAve = _zero_traces(GDites)
GDAccGradientVarianceAve = _zero_traces(GDites)
GDBootstrapLossAve = _zero_traces(GDites)
GDProductsAve = _zero_traces(GDites)
GDHessianFrobeniusesAve = _zero_traces(GDites)
GDHessianTracesAve = _zero_traces(GDites)
GDCovarianceTracesAve = _zero_traces(GDites)
GDAccProductsAve = _zero_traces(GDites)

# ---- SGD accumulators: one array of SGDites entries per experiment -------
# eps * (N_train / bs) -- presumably epochs times mini-batches per epoch.
SGDites = int(eps * N_train / bs)
SGDSecondMoment = _zero_traces(SGDites)
SGDFirstMoment = _zero_traces(SGDites)
SGDTrainLossAve = _zero_traces(SGDites)
SGDTestLossAve = _zero_traces(SGDites)
#AccFirstMoment = [np.zeros(ites) for x in range(NumExp)]
#AccSecondMoment = [np.zeros(ites) for x in range(NumExp)]
SGDGradientVarianceAve = _zero_traces(SGDites)
SGDAccGradientVarianceAve = _zero_traces(SGDites)
SGDBootstrapLossAve = _zero_traces(SGDites)
SGDProductsAve = _zero_traces(SGDites)
SGDHessianFrobeniusesAve = _zero_traces(SGDites)
SGDHessianTracesAve = _zero_traces(SGDites)
SGDCovarianceTracesAve = _zero_traces(SGDites)
# Sized with SGDites (was GDites): this array accumulates SGD outputs and is
# plotted against range(SGDites), so it must match the SGD iteration count.
SGDAccProductsAve = _zero_traces(SGDites)
'''
CVSGDites = int(eps * N_train / bs)# / NumCan)
CVSGDSecondMoment = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDFirstMoment = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDTrainLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDTestLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDGradientVarianceAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDAccGradientVarianceAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDBootstrapLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDProductsAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDHessianFrobeniusesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDHessianTracesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
CVSGDCovarianceTracesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
ANTICVSGDites = int(eps * N_train / bs)# / NumCan)
ANTICVSGDSecondMoment = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDFirstMoment = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDTrainLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDTestLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDGradientVarianceAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDAccGradientVarianceAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDBootstrapLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDProductsAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDHessianFrobeniusesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDHessianTracesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
ANTICVSGDCovarianceTracesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
'''
# Initialize the diagonal linear models
class DiagonalLinear(torch.nn.Module):
    """Bias-free linear map whose effective weight is a difference of squares.

    Two weight matrices of shape ``(outputSize, inputSize)`` are stored in
    ``linear`` and ``linearminus``; the forward pass multiplies the input by
    ``linear.weight**2 - linearminus.weight**2`` (transposed).
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        # Both layers are created first and re-initialised afterwards, in this
        # order, so the sequence of random draws is unchanged.
        self.linear = torch.nn.Linear(inputSize, outputSize, bias=False)
        self.linearminus = torch.nn.Linear(inputSize, outputSize, bias=False)
        for branch in (self.linear, self.linearminus):
            torch.nn.init.kaiming_uniform_(branch.weight)

    def forward(self, x):
        """Return ``x @ (u**2 - v**2).T`` for the two stored weight matrices."""
        positive_part = self.linear.weight.square().T
        negative_part = self.linearminus.weight.square().T
        return torch.matmul(x, positive_part - negative_part)

# One independently initialised diagonal linear model per experiment.
DiaModels = [DiagonalLinear(d, 1) for _ in range(NumExp)]

#Initialize the 2-layer neural network with ReLU
class ReLUNet(torch.nn.Module):
    """Bias-free fully connected network with ReLU between consecutive layers.

    ``input_sizes[i]`` and ``input_sizes[i + 1]`` give the input and output
    width of layer ``i``; no activation is applied after the last layer.
    """

    def __init__(self, num_layers, input_sizes):
        super().__init__()
        self.num_layers = num_layers
        self.layers = torch.nn.ModuleList()
        for idx in range(num_layers):
            fan_in, fan_out = input_sizes[idx], input_sizes[idx + 1]
            layer = torch.nn.Linear(fan_in, fan_out, bias=False)
            self.layers.append(layer)
            # Each layer is re-initialised right after creation, before the
            # next one is built, so the order of random draws is unchanged.
            torch.nn.init.xavier_normal_(layer.weight)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply every layer in turn, with ReLU after all but the last one."""
        final_idx = self.num_layers - 1
        hidden = x
        for idx in range(final_idx):
            hidden = self.relu(self.layers[idx](hidden))
        return self.layers[final_idx](hidden)

# Architecture of the ReLU networks: d inputs -> 5 hidden units -> 1 output.
Num_Layers = 2
Input_Sizes = [d, 5, 1]
# One independently initialised ReLU network per experiment. They are built
# even when unused so the torch RNG advances the same way.
ReLUModels = [
    ReLUNet(num_layers=Num_Layers, input_sizes=Input_Sizes)
    for _ in range(NumExp)
]

# Model family used by the experiment loop.
Models = DiaModels
def _independent_copy(data):
    """Return a copy of ``data`` that shares no memory with the original.

    ``copy.copy`` on a ``torch.Tensor`` builds a new tensor object over the
    SAME storage, so writing into that "copy" would also overwrite the source
    training set. Tensors are therefore cloned; anything else is deep-copied.
    """
    if torch.is_tensor(data):
        return data.clone()
    return copy.deepcopy(data)


# Main experiment.
# For every model initialisation `exp`, every (beta, dataset) draw `s`, and
# every replaced index `i`:
#   * build S(i): the training set with sample i swapped for sample i of an
#     independently drawn alternative set;
#   * run GD once, and SGD NumRun times, on both the original set and S(i),
#     starting from Models[exp] and using the same seed for both sets;
#   * accumulate the per-iteration diagnostics returned for the original set,
#     and the first/second moments of the L2 distance between the two weight
#     trajectories (dim 0 of the returned weights is indexed by iteration).
# After all draws, the sums are turned into averages and the gradient
# variance is accumulated over iterations.
for exp in range(NumExp):
    for s in range(NumS):
        beta = GenerateBeta(d=d, nz=nz, seed=BetaSeeds[s], Random=RandomBeta)
        # NOTE: this rebinds the module-level `d` and `N_train` to whatever
        # EmbDataSet returns; later calls and plot titles see those values.
        x_train, y_train, x_test, y_test, d, N_train, N_test = EmbDataSet(
            beta=beta, RandomSeed=DataSetSeeds[s * 2], d=d, emb=emb,
            N=N, TrainRatio=TrainRatio)
        x_train_alt, y_train_alt, _, _, _, _, _ = EmbDataSet(
            beta=beta, RandomSeed=DataSetSeeds[s * 2 + 1], d=d, emb=emb,
            N=N, TrainRatio=TrainRatio)
        for i in range(NumSet):
            # Construct S(i) on independent copies so the original training
            # set is left untouched.
            x_train_i = _independent_copy(x_train)
            y_train_i = _independent_copy(y_train)
            x_train_i[i] = x_train_alt[i]
            y_train_i[i] = y_train_alt[i]
            # GD (always seeded with ManualSeeds[0])
            (GDWTs, GDTrainLosses, GDTestLosses, GDGradientVariances, GDBootstrapLoss,
             GDProducts, GDHessianFrobeniuses, GDHessianTraces, GDCovarianceTraces,
             GDAccProducts) = GD(
                ini_model=Models[exp],
                x_train=x_train, y_train=y_train,
                x_test=x_test, y_test=y_test,
                d=d, N_train=N_train, eps=eps, bs=bs,
                learningrate=lr,
                Replacement=Replacement,
                seed=ManualSeeds[0], K=K, KBS=KBS,
                EpTimes=EpTimes, ComputeGV=True, ComputeBL=True)
            # Same run on S(i); only the weight trajectory is needed.
            GDWT_is, _, _, _, _, _, _, _, _, _ = GD(
                ini_model=Models[exp],
                x_train=x_train_i, y_train=y_train_i,
                x_test=x_test, y_test=y_test,
                d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                Replacement=Replacement, seed=ManualSeeds[0], K=K, KBS=KBS,
                EpTimes=EpTimes, ComputeGV=False, ComputeBL=False)
            GDTrainLossAve[exp] += GDTrainLosses
            GDTestLossAve[exp] += GDTestLosses
            GDGradientVarianceAve[exp] += GDGradientVariances
            GDBootstrapLossAve[exp] += GDBootstrapLoss
            GDProductsAve[exp] += GDProducts
            GDHessianFrobeniusesAve[exp] += GDHessianFrobeniuses
            GDHessianTracesAve[exp] += GDHessianTraces
            GDCovarianceTracesAve[exp] += GDCovarianceTraces
            GDAccProductsAve[exp] += GDAccProducts
            # Per-iteration L2 distance between the two weight trajectories.
            Residuals = GDWTs - GDWT_is
            Norms = torch.norm(Residuals, p=2, dim=1)
            # print(Residual)
            for ite in range(GDites):
                GDFirstMoment[exp][ite] += Norms[ite]
                GDSecondMoment[exp][ite] += Norms[ite] ** 2
            for run in range(NumRun):
                # SGD (seeded per run; same seed for the original set and S(i))
                (SGDWTs, SGDTrainLosses, SGDTestLosses, SGDGradientVariances,
                 SGDBootstrapLoss, SGDProducts, SGDHessianFrobeniuses, SGDHessianTraces,
                 SGDCovarianceTraces, SGDAccProducts) = SGD(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train,
                    x_test=x_test, y_test=y_test,
                    d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                    ComputeGV=True, ComputeBL=True)
                SGDWT_is, _, _, _, _, _, _, _, _, _ = SGD(
                    ini_model=Models[exp],
                    x_train=x_train_i, y_train=y_train_i,
                    x_test=x_test, y_test=y_test,
                    d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                    ComputeGV=False, ComputeBL=False)
                SGDTrainLossAve[exp] += SGDTrainLosses
                SGDTestLossAve[exp] += SGDTestLosses
                SGDGradientVarianceAve[exp] += SGDGradientVariances
                SGDBootstrapLossAve[exp] += SGDBootstrapLoss
                SGDProductsAve[exp] += SGDProducts
                SGDHessianFrobeniusesAve[exp] += SGDHessianFrobeniuses
                SGDHessianTracesAve[exp] += SGDHessianTraces
                SGDCovarianceTracesAve[exp] += SGDCovarianceTraces
                SGDAccProductsAve[exp] += SGDAccProducts
                Residuals = SGDWTs - SGDWT_is
                Norms = torch.norm(Residuals, p=2, dim=1)
                #print(Residual)
                for ite in range(SGDites):
                    SGDFirstMoment[exp][ite] += Norms[ite]
                    SGDSecondMoment[exp][ite] += Norms[ite] ** 2
                # Disabled CVSGD / ANTICVSGD variants (kept as a string literal).
                '''
                #CVSGD
                CVSGDWTs, CVSGDTrainLosses, CVSGDTestLosses, CVSGDGradientVariances, CVSGDBootstrapLoss, CVSGDProducts, CVSGDHessianFrobeniuses, CVSGDHessianTraces, CVSGDCovarianceTraces = CVSGD(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train, x_test=x_test,
                    y_test=y_test,
                    d=d, N_train=N_train, eps=eps,#int(eps/NumCan),
                    bs=bs, eva_bs=eva_bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS, NumCan=NumCan,
                    ComputeGV=True, ComputeBL=True)
                CVSGDWT_is, _, _, _, _, _, _, _, _ = CVSGD(ini_model=Models[exp],
                                           x_train=x_train_i, y_train=y_train_i, x_test=x_test,
                                           y_test=y_test,
                                           d=d, N_train=N_train, eps=eps,#int(eps/NumCan),
                                               bs=bs, eva_bs=eva_bs, learningrate=lr,
                                           Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS, NumCan=NumCan,
                                           ComputeGV=False, ComputeBL=False)
                CVSGDTrainLossAve[exp] += CVSGDTrainLosses
                CVSGDTestLossAve[exp] += CVSGDTestLosses
                CVSGDGradientVarianceAve[exp] += CVSGDGradientVariances
                CVSGDBootstrapLossAve[exp] += CVSGDBootstrapLoss
                CVSGDProductsAve[exp] += CVSGDProducts
                CVSGDHessianFrobeniusesAve[exp] += CVSGDHessianFrobeniuses
                CVSGDHessianTracesAve[exp] += CVSGDHessianTraces
                CVSGDCovarianceTracesAve[exp] += CVSGDCovarianceTraces
                Residuals = CVSGDWTs - CVSGDWT_is
                Norms = torch.norm(Residuals, p=2, dim=1)
                # print(Residual)
                for ite in range(CVSGDites):
                    CVSGDFirstMoment[exp][ite] += Norms[ite]
                    CVSGDSecondMoment[exp][ite] += Norms[ite] ** 2
                # ANTICVSGD
                ANTICVSGDWTs, ANTICVSGDTrainLosses, ANTICVSGDTestLosses, ANTICVSGDGradientVariances, ANTICVSGDBootstrapLoss, ANTICVSGDProducts, ANTICVSGDHessianFrobeniuses, ANTICVSGDHessianTraces, ANTICVSGDCovarianceTraces = ANTICVSGD(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train, x_test=x_test,
                    y_test=y_test,
                    d=d, N_train=N_train, eps=eps,  # int(eps/NumCan),
                    bs=bs, eva_bs=eva_bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS, NumCan=NumCan,
                    ComputeGV=True, ComputeBL=True)
                ANTICVSGDWT_is, _, _, _, _, _, _, _, _ = ANTICVSGD(ini_model=Models[exp],
                                                  x_train=x_train_i, y_train=y_train_i, x_test=x_test,
                                                  y_test=y_test,
                                                  d=d, N_train=N_train, eps=eps,  # int(eps/NumCan),
                                                  bs=bs, eva_bs=eva_bs, learningrate=lr,
                                                  Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                                                  NumCan=NumCan,
                                                  ComputeGV=False, ComputeBL=False)
                ANTICVSGDTrainLossAve[exp] += ANTICVSGDTrainLosses
                ANTICVSGDTestLossAve[exp] += ANTICVSGDTestLosses
                ANTICVSGDGradientVarianceAve[exp] += ANTICVSGDGradientVariances
                ANTICVSGDBootstrapLossAve[exp] += ANTICVSGDBootstrapLoss
                ANTICVSGDProductsAve[exp] += ANTICVSGDProducts
                ANTICVSGDHessianFrobeniusesAve[exp] += ANTICVSGDHessianFrobeniuses
                ANTICVSGDHessianTracesAve[exp] += ANTICVSGDHessianTraces
                ANTICVSGDCovarianceTracesAve[exp] += ANTICVSGDCovarianceTraces
                Residuals = ANTICVSGDWTs - ANTICVSGDWT_is
                Norms = torch.norm(Residuals, p=2, dim=1)
                # print(Residual)
                for ite in range(ANTICVSGDites):
                    ANTICVSGDFirstMoment[exp][ite] += Norms[ite]
                    ANTICVSGDSecondMoment[exp][ite] += Norms[ite] ** 2
                '''
    # Turn the sums into averages. GD ran once per (s, i); SGD ran NumRun
    # times per (s, i).
    gd_sample_count = NumS * NumSet
    for gd_total in (GDFirstMoment, GDSecondMoment, GDTrainLossAve, GDTestLossAve,
                     GDGradientVarianceAve, GDBootstrapLossAve, GDProductsAve,
                     GDHessianFrobeniusesAve, GDHessianTracesAve,
                     GDCovarianceTracesAve, GDAccProductsAve):
        gd_total[exp] /= gd_sample_count
    sgd_sample_count = NumS * NumSet * NumRun
    for sgd_total in (SGDFirstMoment, SGDSecondMoment, SGDTrainLossAve, SGDTestLossAve,
                      SGDGradientVarianceAve, SGDBootstrapLossAve, SGDProductsAve,
                      SGDHessianFrobeniusesAve, SGDHessianTracesAve,
                      SGDCovarianceTracesAve, SGDAccProductsAve):
        sgd_total[exp] /= sgd_sample_count
    '''
    CVSGDFirstMoment[exp] /= (NumS * NumSet * NumRun)
    CVSGDSecondMoment[exp] /= (NumS * NumSet * NumRun)
    CVSGDTrainLossAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDTestLossAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDGradientVarianceAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDBootstrapLossAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDProductsAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDHessianFrobeniusesAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDHessianTracesAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDCovarianceTracesAve[exp] /= (NumS * NumSet * NumRun)
    CVSGDAccGradientVarianceAve[exp] = copy.copy(CVSGDGradientVarianceAve[exp])
    ANTICVSGDFirstMoment[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDSecondMoment[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDTrainLossAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDTestLossAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDGradientVarianceAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDBootstrapLossAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDProductsAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDHessianFrobeniusesAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDHessianTracesAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDCovarianceTracesAve[exp] /= (NumS * NumSet * NumRun)
    ANTICVSGDAccGradientVarianceAve[exp] = copy.copy(ANTICVSGDGradientVarianceAve[exp])
    '''
    #AccFirstMoment[exp] = copy.copy(FirstMoment[exp])
    #AccFirstMoment[exp] = AccFirstMoment[exp] ** 2
    #AccSecondMoment[exp] = copy.copy(SecondMoment[exp])
    # Running sum over iterations of the averaged gradient variance
    # (entry t holds the sum of entries 0..t); returns a new array.
    GDAccGradientVarianceAve[exp] = np.cumsum(GDGradientVarianceAve[exp])
    SGDAccGradientVarianceAve[exp] = np.cumsum(SGDGradientVarianceAve[exp])
    '''
    for ite in range(CVSGDites):
        if ite > 0:
            CVSGDAccGradientVarianceAve[exp][ite] = CVSGDAccGradientVarianceAve[exp][ite] + CVSGDAccGradientVarianceAve[exp][ite - 1]
    for ite in range(ANTICVSGDites):
        if ite > 0:
            ANTICVSGDAccGradientVarianceAve[exp][ite] = ANTICVSGDAccGradientVarianceAve[exp][ite] + ANTICVSGDAccGradientVarianceAve[exp][ite - 1]
    '''
    #        AccFirstMoment[exp][ite] = AccFirstMoment[exp][ite] + AccFirstMoment[exp][ite - 1]
    #        AccSecondMoment[exp][ite] = AccSecondMoment[exp][ite] + AccSecondMoment[exp][ite - 1]
#print("The Squared First Moment is {}".format(FirstMoment ** 2))
#print("The Second Moment is {}".format(SecondMoment))
#print("The Accumulated Second Moment is {}".format(AccSecondMoment))

#print(TestLosses)
#GD plots
def _save_curve_figure(num_iters, curve_families, title, filename, ylabel=None, log_y=True):
    """Draw per-experiment curves on one figure, save it and close it.

    num_iters: number of x positions; curves are plotted against range(num_iters).
    curve_families: list of (series_per_experiment, legend_prefix, linestyle);
        series_per_experiment[exp] is the y-array of experiment ``exp``.
    title, filename: figure title and the name passed to ``plt.savefig``.
    ylabel: y-axis label, or None to leave the axis unlabelled.
    log_y: use a logarithmic y-axis when True.
    """
    steps = range(num_iters)
    for exp_idx in range(NumExp):
        # Wrap around so more experiments than colours does not raise.
        colour = Colors[exp_idx % len(Colors)]
        for series, legend_prefix, linestyle in curve_families:
            plt.plot(steps, series[exp_idx], color=colour, linestyle=linestyle,
                     label='{} {}'.format(legend_prefix, exp_idx + 1))
    plt.legend()
    if log_y:
        plt.yscale('log')
    plt.title(title)
    plt.xlabel("Iteration")
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close()


def _plot_optimizer_results(tag, num_iters, first_moment, second_moment, train_loss,
                            test_loss, bootstrap_loss, gradient_variance,
                            acc_gradient_variance, products, hessian_frobenius,
                            hessian_trace, covariance_trace, acc_products):
    """Write the 13 diagnostic figures of one optimiser.

    tag is the optimiser name ("GD" or "SGD"); it appears in every title and
    prefixes every output file name. All other arguments are the averaged
    per-experiment arrays, indexed as [exp][iteration].
    """
    experiments = range(NumExp)
    # The sparsity in the title follows `nz` instead of a hard-coded 5.
    suffix = " of {} (d={}, N={}, bs={}, {}-sparse beta)".format(
        tag, d, int(N * TrainRatio), bs, nz)

    # Squared first moment (dotted) against second moment (dashed).
    _save_curve_figure(
        num_iters,
        [([first_moment[e] ** 2 for e in experiments], 'First Moments', 'dotted'),
         (second_moment, 'Second Moments', 'dashed')],
        "First and Second Moments" + suffix, tag + "Moments", log_y=False)

    generalization_gap = [test_loss[e] - train_loss[e] for e in experiments]
    moment_ratio = [second_moment[e] / first_moment[e] ** 2 for e in experiments]
    # (series, legend prefix, title prefix, y label, file stem, log y-axis)
    single_curve_figures = [
        (train_loss, 'Train Loss Average', "Average Train Losses",
         "Train Loss", "TrainLosses", True),
        (test_loss, 'Test Loss Average', "Average Test Losses",
         "Test Loss", "TestLosses", True),
        (generalization_gap, 'Generalization Gap Average', "Average Generalization Gaps",
         "Generalization Gap", "GeneralizationGaps", True),
        (bootstrap_loss, 'Train Loss Variance Average', "Average Train Loss Variance",
         "Train Loss Variance", "TrainLossVariance", True),
        (gradient_variance, 'Gradient Variance Average', "Average Gradient Variances",
         "Gradient Variance", "GradientVariances", True),
        (acc_gradient_variance, 'Accumulated Gradient Variance Average',
         "Average Accumulated Gradient Variances",
         "Accumulated Gradient Variance", "AccGradientVariances", True),
        (products, 'Trace Product Average', "Average Trace Product",
         "Trace Product", "TraceProducts", True),
        (hessian_frobenius, 'Hessian Frobenius Average', "Average Hessian Frobenius",
         "Hessian Frobenius", "HessianFrobeniuses", True),
        (hessian_trace, 'Hessian Trace Average', "Average Hessian Trace",
         "Hessian Trace", "HessianTraces", True),
        (covariance_trace, 'Covariance Matrix Trace Average',
         "Average Covariance Matrix Trace",
         "Covariance Matrix Trace", "CovarianceMatrixTraces", True),
        (acc_products, 'Acc Trace Product Average', "Average Accumulated Trace Product",
         "Accumulated Trace Product", "AccTraceProducts", True),
        (moment_ratio, 'Moment Ratio',
         "Ratio between the Second Moments and the Squared First Moments",
         "Ratio", "MomentRatios", False),
    ]
    for series, legend_prefix, heading, ylabel, stem, log_y in single_curve_figures:
        _save_curve_figure(num_iters, [(series, legend_prefix, 'solid')],
                           heading + suffix, tag + stem, ylabel=ylabel, log_y=log_y)


_plot_optimizer_results(
    "GD", GDites, GDFirstMoment, GDSecondMoment, GDTrainLossAve, GDTestLossAve,
    GDBootstrapLossAve, GDGradientVarianceAve, GDAccGradientVarianceAve,
    GDProductsAve, GDHessianFrobeniusesAve, GDHessianTracesAve,
    GDCovarianceTracesAve, GDAccProductsAve)

# SGD plots
_plot_optimizer_results(
    "SGD", SGDites, SGDFirstMoment, SGDSecondMoment, SGDTrainLossAve, SGDTestLossAve,
    SGDBootstrapLossAve, SGDGradientVarianceAve, SGDAccGradientVarianceAve,
    SGDProductsAve, SGDHessianFrobeniusesAve, SGDHessianTracesAve,
    SGDCovarianceTracesAve, SGDAccProductsAve)
'''
#CVSGD plots
plt.plot(range(CVSGDites), CVSGDFirstMoment[0] ** 2, color=Colors[0], linestyle='dotted', label='First Moments 1')
plt.plot(range(CVSGDites), CVSGDSecondMoment[0], color=Colors[0], linestyle='dashed', label='Second Moments 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDFirstMoment[exp] ** 2, color=Colors[exp], linestyle='dotted', label='First Moments {}'.format(str(exp+1)))
    plt.plot(range(CVSGDites), CVSGDSecondMoment[exp], color=Colors[exp], linestyle='dashed', label='Second Moments {}'.format(str(exp+1)))
plt.legend()
#plt.yscale('log')
plt.title("First and Second Moments of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
#plt.ylabel("Train Gradient Variances")
plt.savefig("CVSGDMoments")
plt.close()

plt.plot(range(CVSGDites), CVSGDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Train Loss Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Train Loss Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Train Losses of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Train Loss")
plt.savefig("CVSGDTrainLosses")
plt.close()

plt.plot(range(CVSGDites), CVSGDTestLossAve[0], color=Colors[0], linestyle='solid', label='Test Loss Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDTestLossAve[exp], color=Colors[exp], linestyle='solid', label='Test Loss Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Test Losses of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Test Loss")
plt.savefig("CVSGDTestLosses")
plt.close()

plt.plot(range(CVSGDites), CVSGDTestLossAve[0] - CVSGDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Generalization Gap Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDTestLossAve[exp] - CVSGDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Generalization Gap Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Generalization Gaps of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Generalization Gap")
plt.savefig("CVSGDGeneralizationGaps")
plt.close()

plt.plot(range(CVSGDites), CVSGDBootstrapLossAve[0], color=Colors[0], linestyle='solid', label='Train Loss Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDBootstrapLossAve[exp], color=Colors[exp], linestyle='solid', label='Train Loss Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Train Loss Variance of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Train Loss Variance")
plt.savefig("CVSGDTrainLossVariance")
plt.close()

plt.plot(range(CVSGDites), CVSGDGradientVarianceAve[0], color=Colors[0], linestyle='solid', label='Gradient Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDGradientVarianceAve[exp], color=Colors[exp], linestyle='solid', label='Gradient Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Gradient Variances of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Gradient Variance")
plt.savefig("CVSGDGradientVariances")
plt.close()

plt.plot(range(CVSGDites), CVSGDAccGradientVarianceAve[0], color=Colors[0], linestyle='solid', label='Accumulated Gradient Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDAccGradientVarianceAve[exp], color=Colors[exp], linestyle='solid', label='Accumulated Gradient Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Accumulated Gradient Variances of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Accumulated Gradient Variance")
plt.savefig("CVSGDAccGradientVariances")
plt.close()

plt.plot(range(CVSGDites), CVSGDProductsAve[0], color=Colors[0], linestyle='solid', label='Trace Product Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDProductsAve[exp], color=Colors[exp], linestyle='solid', label='Trace Product Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Trace Products of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Trace Product")
plt.savefig("CVSGDTraceProducts")
plt.close()

plt.plot(range(CVSGDites), CVSGDHessianFrobeniusesAve[0], color=Colors[0], linestyle='solid', label='Hessian Frobenius Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDHessianFrobeniusesAve[exp], color=Colors[exp], linestyle='solid', label='Hessian Frobenius Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Hessian Frobenius of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Hessian Frobenius")
plt.savefig("CVSGDHessianFrobeniuses")
plt.close()

plt.plot(range(CVSGDites), CVSGDHessianTracesAve[0], color=Colors[0], linestyle='solid', label='Hessian Trace Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDHessianTracesAve[exp], color=Colors[exp], linestyle='solid', label='Hessian Trace Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Hessian Trace of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Hessian Trace")
plt.savefig("CVSGDHessianTraces")
plt.close()

plt.plot(range(CVSGDites), CVSGDCovarianceTracesAve[0], color=Colors[0], linestyle='solid', label='Covariance Matrix Trace Average 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDCovarianceTracesAve[exp], color=Colors[exp], linestyle='solid', label='Covariance Matrix Trace Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Covariance Matrix Trace of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Covariance Matrix Trace")
plt.savefig("CVSGDCovarianceMatrixTraces")
plt.close()

plt.plot(range(CVSGDites), CVSGDSecondMoment[0] / CVSGDFirstMoment[0] ** 2, color=Colors[0], linestyle='solid', label='Moment Ratio 1')
for exp in range(1, NumExp):
    plt.plot(range(CVSGDites), CVSGDSecondMoment[exp] / CVSGDFirstMoment[exp] ** 2, color=Colors[exp], linestyle='solid', label='Moment Ratio {}'.format(str(exp+1)))
plt.legend()
#plt.yscale('log')
plt.title("Ratio between the Second Moments and the Squared First Moments of CVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Ratio")
plt.savefig("CVSGDMomentRatios")
plt.close()

#ANTICVSGD plots
plt.plot(range(ANTICVSGDites), ANTICVSGDFirstMoment[0] ** 2, color=Colors[0], linestyle='dotted', label='First Moments 1')
plt.plot(range(ANTICVSGDites), ANTICVSGDSecondMoment[0], color=Colors[0], linestyle='dashed', label='Second Moments 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDFirstMoment[exp] ** 2, color=Colors[exp], linestyle='dotted', label='First Moments {}'.format(str(exp+1)))
    plt.plot(range(ANTICVSGDites), ANTICVSGDSecondMoment[exp], color=Colors[exp], linestyle='dashed', label='Second Moments {}'.format(str(exp+1)))
plt.legend()
#plt.yscale('log')
plt.title("First and Second Moments of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
#plt.ylabel("Train Gradient Variances")
plt.savefig("ANTICVSGDMoments")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Train Loss Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Train Loss Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Train Losses of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Train Loss")
plt.savefig("ANTICVSGDTrainLosses")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDTestLossAve[0], color=Colors[0], linestyle='solid', label='Test Loss Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDTestLossAve[exp], color=Colors[exp], linestyle='solid', label='Test Loss Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Test Losses of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Test Loss")
plt.savefig("ANTICVSGDTestLosses")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDTestLossAve[0] - ANTICVSGDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Generalization Gap Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDTestLossAve[exp] - ANTICVSGDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Generalization Gap Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Generalization Gaps of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Generalization Gap")
plt.savefig("ANTICVSGDGeneralizationGaps")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDBootstrapLossAve[0], color=Colors[0], linestyle='solid', label='Train Loss Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDBootstrapLossAve[exp], color=Colors[exp], linestyle='solid', label='Train Loss Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Train Loss Variance of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Train Loss Variance")
plt.savefig("ANTICVSGDTrainLossVariance")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDGradientVarianceAve[0], color=Colors[0], linestyle='solid', label='Gradient Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDGradientVarianceAve[exp], color=Colors[exp], linestyle='solid', label='Gradient Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Gradient Variances of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Gradient Variance")
plt.savefig("ANTICVSGDGradientVariances")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDAccGradientVarianceAve[0], color=Colors[0], linestyle='solid', label='Accumulated Gradient Variance Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDAccGradientVarianceAve[exp], color=Colors[exp], linestyle='solid', label='Accumulated Gradient Variance Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Accumulated Gradient Variances of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Accumulated Gradient Variance")
plt.savefig("ANTICVSGDAccGradientVariances")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDProductsAve[0], color=Colors[0], linestyle='solid', label='Trace Product Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDProductsAve[exp], color=Colors[exp], linestyle='solid', label='Trace Product Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Trace Products of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Trace Product")
plt.savefig("ANTICVSGDTraceProducts")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDHessianFrobeniusesAve[0], color=Colors[0], linestyle='solid', label='Hessian Frobenius Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDHessianFrobeniusesAve[exp], color=Colors[exp], linestyle='solid', label='Hessian Frobenius Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Hessian Frobenius of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Hessian Frobenius")
plt.savefig("ANTICVSGDHessianFrobeniuses")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDHessianTracesAve[0], color=Colors[0], linestyle='solid', label='Hessian Trace Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDHessianTracesAve[exp], color=Colors[exp], linestyle='solid', label='Hessian Trace Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Hessian Trace of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Hessian Trace")
plt.savefig("ANTICVSGDHessianTraces")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDCovarianceTracesAve[0], color=Colors[0], linestyle='solid', label='Covariance Matrix Trace Average 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDCovarianceTracesAve[exp], color=Colors[exp], linestyle='solid', label='Covariance Matrix Trace Average {}'.format(str(exp+1)))
plt.legend()
plt.yscale('log')
plt.title("Average Covariance Matrix Trace of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Covariance Matrix Trace")
plt.savefig("ANTICVSGDCovarianceMatrixTraces")
plt.close()

plt.plot(range(ANTICVSGDites), ANTICVSGDSecondMoment[0] / ANTICVSGDFirstMoment[0] ** 2, color=Colors[0], linestyle='solid', label='Moment Ratio 1')
for exp in range(1, NumExp):
    plt.plot(range(ANTICVSGDites), ANTICVSGDSecondMoment[exp] / ANTICVSGDFirstMoment[exp] ** 2, color=Colors[exp], linestyle='solid', label='Moment Ratio {}'.format(str(exp+1)))
plt.legend()
#plt.yscale('log')
plt.title("Ratio between the Second Moments and the Squared First Moments of ANTICVSGD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
plt.xlabel("Iteration")
plt.ylabel("Ratio")
plt.savefig("ANTICVSGDMomentRatios")
plt.close()
'''
print("Run time is {}".format(time.time() - starting_time))
print("Last 1000")
# Tail-window summary: for every experiment, average each tracked series over
# its final 1000 iterations and print one line per metric (all GD experiments
# first, then all SGD experiments).
#
# The window is an open-ended slice on purpose. The earlier `[-1000:-1]` form
# stopped one element short, so only 999 entries were averaged and the final
# iteration was silently dropped despite the "Last 1000" header.
# NOTE(review): assumes the last entry of each series is populated by the
# training loop — confirm against the code that fills these arrays.
last_1000 = slice(-1000, None)

# (method tag, test-loss series, train-loss series, ((label, series), ...)).
# The generalization gap is derived from the two loss series; every other
# metric is printed straight from its own series, in the order listed.
# The "Moment" line reports the second-moment series.
summary_1000 = (
    ("GD", GDTestLossAve, GDTrainLossAve, (
        ("Moment", GDSecondMoment),
        ("Accumulated Gradient Variance", GDAccGradientVarianceAve),
        ("Training Loss Variance", GDBootstrapLossAve),
        ("Training Loss", GDTrainLossAve),
        ("Trace Product", GDProductsAve),
        ("Hessian Frobenius", GDHessianFrobeniusesAve),
        ("Hessian Trace", GDHessianTracesAve),
        ("Covariance Matrix Trace", GDCovarianceTracesAve),
        ("Accumulated Trace Product", GDAccProductsAve),
    )),
    ("SGD", SGDTestLossAve, SGDTrainLossAve, (
        ("Moment", SGDSecondMoment),
        ("Accumulated Gradient Variance", SGDAccGradientVarianceAve),
        ("Training Loss Variance", SGDBootstrapLossAve),
        ("Training Loss", SGDTrainLossAve),
        ("Trace Product", SGDProductsAve),
        ("Hessian Frobenius", SGDHessianFrobeniusesAve),
        ("Hessian Trace", SGDHessianTracesAve),
        ("Covariance Matrix Trace", SGDCovarianceTracesAve),
        ("Accumulated Trace Product", SGDAccProductsAve),
    )),
)
for method, test_loss, train_loss, metrics in summary_1000:
    for j in range(NumExp):
        # Generalization gap = mean test loss minus mean train loss over the window.
        gap = test_loss[j][last_1000].mean() - train_loss[j][last_1000].mean()
        print("{} {} Generalization Gap is {}".format(method, j + 1, gap))
        for label, series in metrics:
            print("{} {} {} is {}".format(method, j + 1, label, series[j][last_1000].mean()))

'''
for j in range(NumExp):
    print("CVSGD {} Generalization Gap is {}".format(j+1, CVSGDTestLossAve[j][-1000:-1].mean() - CVSGDTrainLossAve[j][-1000:-1].mean()))
    print("CVSGD {} Moment is {}".format(j+1, CVSGDSecondMoment[j][-1000:-1].mean()))
    print("CVSGD {} Accumulated Gradient Variance is {}".format(j+1, CVSGDAccGradientVarianceAve[j][-1000:-1].mean()))
    print("CVSGD {} Training Loss Variance is {}".format(j + 1, CVSGDBootstrapLossAve[j][-1000:-1].mean()))
    print("CVSGD {} Training Loss is {}".format(j + 1, CVSGDTrainLossAve[j][-1000:-1].mean()))
    print("CVSGD {} Trace Product is {}".format(j + 1, CVSGDProductsAve[j][-1000:-1].mean()))
    print("CVSGD {} Hessian Frobenius is {}".format(j + 1, CVSGDHessianFrobeniusesAve[j][-1000:-1].mean()))
    print("CVSGD {} Hessian Trace is {}".format(j + 1, CVSGDHessianTracesAve[j][-1000:-1].mean()))
    print("CVSGD {} Covariance Matrix Trace is {}".format(j + 1, CVSGDCovarianceTracesAve[j][-1000:-1].mean()))

for j in range(NumExp):
    print("ANTICVSGD {} Generalization Gap is {}".format(j+1, ANTICVSGDTestLossAve[j][-1000:-1].mean() - ANTICVSGDTrainLossAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Moment is {}".format(j+1, ANTICVSGDSecondMoment[j][-1000:-1].mean()))
    print("ANTICVSGD {} Accumulated Gradient Variance is {}".format(j+1, ANTICVSGDAccGradientVarianceAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Training Loss Variance is {}".format(j + 1, ANTICVSGDBootstrapLossAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Training Loss is {}".format(j + 1, ANTICVSGDTrainLossAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Trace Product is {}".format(j + 1, ANTICVSGDProductsAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Hessian Frobenius is {}".format(j + 1, ANTICVSGDHessianFrobeniusesAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Hessian Trace is {}".format(j + 1, ANTICVSGDHessianTracesAve[j][-1000:-1].mean()))
    print("ANTICVSGD {} Covariance Matrix Trace is {}".format(j + 1, ANTICVSGDCovarianceTracesAve[j][-1000:-1].mean()))
'''
print("Last 2000")
# Tail-window summary: for every experiment, average each tracked series over
# its final 2000 iterations and print one line per metric (all GD experiments
# first, then all SGD experiments).
#
# The window is an open-ended slice on purpose. The earlier `[-2000:-1]` form
# stopped one element short, so only 1999 entries were averaged and the final
# iteration was silently dropped despite the "Last 2000" header.
# NOTE(review): assumes the last entry of each series is populated by the
# training loop — confirm against the code that fills these arrays.
last_2000 = slice(-2000, None)

# (method tag, test-loss series, train-loss series, ((label, series), ...)).
# The generalization gap is derived from the two loss series; every other
# metric is printed straight from its own series, in the order listed.
# The "Moment" line reports the second-moment series.
summary_2000 = (
    ("GD", GDTestLossAve, GDTrainLossAve, (
        ("Moment", GDSecondMoment),
        ("Accumulated Gradient Variance", GDAccGradientVarianceAve),
        ("Training Loss Variance", GDBootstrapLossAve),
        ("Training Loss", GDTrainLossAve),
        ("Trace Product", GDProductsAve),
        ("Hessian Frobenius", GDHessianFrobeniusesAve),
        ("Hessian Trace", GDHessianTracesAve),
        ("Covariance Matrix Trace", GDCovarianceTracesAve),
        ("Accumulated Trace Product", GDAccProductsAve),
    )),
    ("SGD", SGDTestLossAve, SGDTrainLossAve, (
        ("Moment", SGDSecondMoment),
        ("Accumulated Gradient Variance", SGDAccGradientVarianceAve),
        ("Training Loss Variance", SGDBootstrapLossAve),
        ("Training Loss", SGDTrainLossAve),
        ("Trace Product", SGDProductsAve),
        ("Hessian Frobenius", SGDHessianFrobeniusesAve),
        ("Hessian Trace", SGDHessianTracesAve),
        ("Covariance Matrix Trace", SGDCovarianceTracesAve),
        ("Accumulated Trace Product", SGDAccProductsAve),
    )),
)
for method, test_loss, train_loss, metrics in summary_2000:
    for j in range(NumExp):
        # Generalization gap = mean test loss minus mean train loss over the window.
        gap = test_loss[j][last_2000].mean() - train_loss[j][last_2000].mean()
        print("{} {} Generalization Gap is {}".format(method, j + 1, gap))
        for label, series in metrics:
            print("{} {} {} is {}".format(method, j + 1, label, series[j][last_2000].mean()))

'''
for j in range(NumExp):
    print("CVSGD {} Generalization Gap is {}".format(j+1, CVSGDTestLossAve[j][-2000:-1].mean() - CVSGDTrainLossAve[j][-2000:-1].mean()))
    print("CVSGD {} Moment is {}".format(j+1, CVSGDSecondMoment[j][-2000:-1].mean()))
    print("CVSGD {} Accumulated Gradient Variance is {}".format(j+1, CVSGDAccGradientVarianceAve[j][-2000:-1].mean()))
    print("CVSGD {} Training Loss Variance is {}".format(j + 1, CVSGDBootstrapLossAve[j][-2000:-1].mean()))
    print("CVSGD {} Training Loss is {}".format(j + 1, CVSGDTrainLossAve[j][-2000:-1].mean()))
    print("CVSGD {} Trace Product is {}".format(j + 1, CVSGDProductsAve[j][-2000:-1].mean()))
    print("CVSGD {} Hessian Frobenius is {}".format(j + 1, CVSGDHessianFrobeniusesAve[j][-2000:-1].mean()))
    print("CVSGD {} Hessian Trace is {}".format(j + 1, CVSGDHessianTracesAve[j][-2000:-1].mean()))
    print("CVSGD {} Covariance Matrix Trace is {}".format(j + 1, CVSGDCovarianceTracesAve[j][-2000:-1].mean()))

for j in range(NumExp):
    print("ANTICVSGD {} Generalization Gap is {}".format(j+1, ANTICVSGDTestLossAve[j][-2000:-1].mean() - ANTICVSGDTrainLossAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Moment is {}".format(j+1, ANTICVSGDSecondMoment[j][-2000:-1].mean()))
    print("ANTICVSGD {} Accumulated Gradient Variance is {}".format(j+1, ANTICVSGDAccGradientVarianceAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Training Loss Variance is {}".format(j + 1, ANTICVSGDBootstrapLossAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Training Loss is {}".format(j + 1, ANTICVSGDTrainLossAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Trace Product is {}".format(j + 1, ANTICVSGDProductsAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Hessian Frobenius is {}".format(j + 1, ANTICVSGDHessianFrobeniusesAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Hessian Trace is {}".format(j + 1, ANTICVSGDHessianTracesAve[j][-2000:-1].mean()))
    print("ANTICVSGD {} Covariance Matrix Trace is {}".format(j + 1, ANTICVSGDCovarianceTracesAve[j][-2000:-1].mean()))
'''